TFLearn Fragment Detection

Catherine has prepared data files in which sentences have been turned into fragments. As input I will use 60,000 fragments and 60,000 sentences, with the fragments derived from those same sentences. (In the future, the fragments will not be derived from the input sentences.) Each example is labeled 1 for a complete sentence and 0 for a fragment.
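
Each line of a `detailedRemoval.txt` file is assumed to be a ` ||| `-delimited record where field 0 is the original sentence and field 2 is the fragment built from it; the loading code below touches only those two fields, and what field 1 holds is not assumed here. A sketch of one such record, with invented content:

In [ ]:
# Hypothetical record, for illustration only -- the real files supply these lines.
line = "She sang and he danced. ||| <removal details> ||| she sang"
fields = line.split(" ||| ")
print(fields[0])  # original sentence -> label 1
print(fields[2])  # fragment          -> label 0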

Install Dependencies


In [ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
nlp = spacy.load('en')
import re
from nltk.util import ngrams, trigrams
import csv

Load Datafiles


In [ ]:
texts = []
labels = []

with open("../.removingPOS/updatedSentences/conjunctionSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)
        
with open("../.removingPOS/updatedSentences/nounSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)

with open("../.removingPOS/updatedSentences/nounverbSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)
        
with open("../.removingPOS/updatedSentences/verbSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)
        
print(texts[-10:])
Shuffle the data

In [ ]:
import random

combined = list(zip(texts,labels))
random.shuffle(combined)

texts[:], labels[:] = zip(*combined)
print(texts[-10:])
print(labels[-10:])
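
If the train/test membership needs to be stable across runs (say, to compare models), seed the RNG before shuffling; a seeded variant of the cell above:

In [ ]:
# Optional, illustrative: seeding makes the shuffle (and so the later split)
# reproducible. The seed value 42 is arbitrary.
random.seed(42)
combined = list(zip(texts, labels))
random.shuffle(combined)
texts[:], labels[:] = zip(*combined)
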
Get parts of speech for text string

In [ ]:
def textStringToPOSArray(text):
    doc = nlp(text)
    tags = []
    for word in doc:
        tags.append(word.pos_)
    return tags

textStringToPOSArray(texts[3])
Get POS trigrams for a text string

In [ ]:
def find_ngrams(input_list, n):
    # Pure-Python n-gram generator (kept for reference; nltk's trigrams is used below).
    return zip(*[input_list[i:] for i in range(n)])

def getPOSTrigramsForTextString(text):
    tags = textStringToPOSArray(text)
    tgrams = list(trigrams(tags))
    return tgrams

print("Text: ", texts[3], labels[3])
getPOSTrigramsForTextString(texts[3])
Turn Trigrams into Dict keys

In [ ]:
def trigramsToDictKeys(trigrams):
    keys = []
    for trigram in trigrams:
        keys.append('>'.join(trigram))
    return keys

print(texts[2])
print(trigramsToDictKeys(getPOSTrigramsForTextString(texts[2])))

In [ ]:
from collections import Counter

c = Counter()

for textString in texts:
    c.update(trigramsToDictKeys(getPOSTrigramsForTextString(textString)))

total_counts = c

print("Total words in data set: ", len(total_counts))

In [ ]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:1200]
print(vocab[:60])

In [ ]:
print(vocab[-1], ': ', total_counts[vocab[-1]])

Take the trigrams and index them


In [ ]:
word2idx = {n: i for i, n in enumerate(vocab)}  # map each POS-trigram key to its vocab index
print(word2idx)

In [ ]:
def textToTrigrams(text): 
    return trigramsToDictKeys(getPOSTrigramsForTextString(text))

def text_to_vector(text):
    wordVector = np.zeros(len(vocab))
    for word in textToTrigrams(text):
        index = word2idx.get(word)
        if index is not None:
            wordVector[index] += 1
    return wordVector
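
As a quick, illustrative sanity check, the vector's total count should equal the number of the text's POS trigrams that landed inside the 1,200-trigram vocabulary:

In [ ]:
# Sanity check (illustrative): counted trigrams == trigrams the vocab knows about.
vec = text_to_vector(texts[0])
known = sum(1 for t in textToTrigrams(texts[0]) if t in word2idx)
print(int(vec.sum()), known)  # the two numbers should match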

In [ ]:
text_to_vector('The tea is for a party to celebrate '
               'the movie so she has no time for a cake')[:65]

In [ ]:
word_vectors = np.zeros((len(texts), len(vocab)), dtype=np.int_)
for ii, text in enumerate(texts):
    word_vectors[ii] = text_to_vector(text)

In [ ]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]

Chunking the data for TF


In [ ]:
records = len(labels)
train_fraction = 0.9  # first 90% of the shuffled data trains; the final 10% is held out

split = int(records * train_fraction)
print(split)
trainX, trainY = word_vectors[:split], to_categorical(labels[:split], 2)
testX, testY = word_vectors[split:], to_categorical(labels[split:], 2)

In [ ]:
trainX[-1], trainY[-1]

In [ ]:
len(trainY), len(testY), len(trainY) + len(testY)

Setting up TF


In [ ]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    net = tflearn.input_data([None, len(vocab)])                      # Input
    net = tflearn.fully_connected(net, 200, activation='ReLU')        # Hidden
    net = tflearn.fully_connected(net, 25, activation='ReLU')         # Hidden
    net = tflearn.fully_connected(net, 2, activation='softmax')       # Output
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    model = tflearn.DNN(net)

    return model

In [ ]:
len(vocab)

Initialize


In [ ]:
model = build_model()

Training


In [ ]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=50)

In [ ]:
# Testing: column 0 of the softmax output is the probability of class 0
# (fragment), so threshold it and compare with column 0 of the one-hot labels.
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

Playground


In [ ]:
def test_sentence(sentence):
    # Column 1 of the softmax output is the probability that the input
    # is a complete sentence (label 1).
    positive_prob = model.predict([text_to_vector(sentence)])[0][1]
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')

In [ ]:
test_sentence("Even though he had the better arguments and was by far the more powerful speaker.")

In [ ]:
test_sentence("Even though he had the better arguments and was by far the more powerful speaker, Peter lost the debate.")

In [ ]:
test_sentence("Working far into the night in an effort to salvage her little boat.")

In [ ]:
test_sentence("She was working far into the night in an effort to salvage her little boat.")

In [ ]:
test_sentence("The man eating pizza.")

In [ ]:
test_sentence("The man eating pizza is overwieght.")

In [ ]:
test_sentence("While we were swimming at the lake.")

In [ ]:
test_sentence("While we were swimming at the lake, we saw a fish.")

In [ ]:
test_sentence("Keep going.")

In [ ]:
test_sentence("A time of wonder and amazement")

In [ ]:
test_sentence("That was a time of wonder and amazement")

In [ ]:
test_sentence("Since she never saw that movie.")

In [ ]:
test_sentence("We should invite her, since she never saw that movie.")

In [ ]:
test_sentence("Affecting the lives of many students in New York City.")

In [ ]:
test_sentence("Quill is affecting the lives of many students in New York City.")

In [ ]:
test_sentence("Standing on the edge of the cliff looking down.")

In [ ]:
test_sentence("I'm standing on the edge of the cliff and looking down.")

In [ ]:
test_sentence("The team looked forward to victory.")

In [ ]:
model.save("./model.tfl")
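
To reuse the model later, TFLearn needs the identical graph rebuilt before the weights are restored; a minimal sketch:

In [ ]:
# Sketch: restore the saved weights into a freshly built, identical graph.
model = build_model()
model.load("./model.tfl")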

Save the vocab


In [ ]:
# Use a context manager so the file is flushed and closed; newline="" avoids blank rows on Windows.
with open("./vocabindex.csv", "w", newline="") as f:
    w = csv.writer(f)
    for key, val in word2idx.items():
        w.writerow([key, val])
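
The mirror operation, reading `vocabindex.csv` back into a dict (e.g., in a serving process), is a few lines; a sketch (the name `word2idxLoaded` is just for illustration):

In [ ]:
# Sketch: rebuild the trigram-to-index map from the saved CSV.
word2idxLoaded = {}
with open("./vocabindex.csv") as f:
    for row in csv.reader(f):
        if row:  # skip any blank rows
            word2idxLoaded[row[0]] = int(row[1])
print(len(word2idxLoaded))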

In [ ]:
vocab
